{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metadata Extraction and Augmentation w/ Marvin\n",
    "\n",
    "This notebook walks through using [`Marvin`](https://github.com/PrefectHQ/marvin) to extract and augment metadata from text. Marvin uses the LLM to identify and extract metadata.  Metadata can be anything from additional and enhanced questions and answers to business object identification and elaboration.  This notebook will demonstrate pulling out and elaborating on Sports Supplement information in a csv document.\n",
    "\n",
    "Note: You will need to supply a valid open ai key below to run this notebook."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index-llms-openai\n",
    "%pip install llama-index-extractors-marvin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install marvin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core.node_parser import TokenTextSplitter\n",
    "from llama_index.extractors.marvin import MarvinMetadataExtractor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import openai\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = SimpleDirectoryReader(\"data\").load_data()\n",
    "\n",
    "# limit document text length\n",
    "documents[0].text = documents[0].text[:10000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import marvin\n",
    "\n",
    "from pydantic import BaseModel, Field\n",
    "\n",
    "marvin.settings.openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n",
    "marvin.settings.openai.chat.completions.model = \"gpt-4o\"\n",
    "\n",
    "\n",
    "class SportsSupplement(BaseModel):\n",
    "    name: str = Field(..., description=\"The name of the sports supplement\")\n",
    "    description: str = Field(\n",
    "        ..., description=\"A description of the sports supplement\"\n",
    "    )\n",
    "    pros_cons: str = Field(\n",
    "        ..., description=\"The pros and cons of the sports supplement\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 41.49it/s]\n",
      "Extracting marvin metadata: 100%|██████████| 9/9 [00:22<00:00,  2.46s/it]\n"
     ]
    }
   ],
   "source": [
    "# construct text splitter to split texts into chunks for processing\n",
    "# this takes a while to process, you can increase processing time by using larger chunk_size\n",
    "# file size is a factor too of course\n",
    "node_parser = TokenTextSplitter(\n",
    "    separator=\" \", chunk_size=512, chunk_overlap=128\n",
    ")\n",
    "\n",
    "# create metadata extractor\n",
    "metadata_extractor = MarvinMetadataExtractor(\n",
    "    marvin_model=SportsSupplement\n",
    ")  # let's extract custom entities for each node.\n",
    "\n",
    "# use node_parser to get nodes from the documents\n",
    "from llama_index.core.ingestion import IngestionPipeline\n",
    "\n",
    "pipeline = IngestionPipeline(transformations=[node_parser, metadata_extractor])\n",
    "\n",
    "nodes = pipeline.run(documents=documents, show_progress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'creation_date': '2024-08-07',\n",
      " 'file_name': 'Sports Supplements.csv',\n",
      " 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/examples/metadata_extraction/data/Sports '\n",
      "              'Supplements.csv',\n",
      " 'file_size': 62403,\n",
      " 'file_type': 'text/csv',\n",
      " 'last_modified_date': '2024-08-07',\n",
      " 'marvin_metadata': {'description': 'L-arginine alpha-ketoglutarate is a '\n",
      "                                    'supplement often used to improve peak '\n",
      "                                    'power output and strength–power during '\n",
      "                                    'weight training. A 2006 study by Campbell '\n",
      "                                    'et al. found that AAKG supplementation '\n",
      "                                    'improved maximum effort 1-repetition '\n",
      "                                    'bench press and Wingate peak power '\n",
      "                                    'performance.',\n",
      "                     'name': 'AAKG',\n",
      "                     'pros_cons': 'Pros: Improves peak power output and '\n",
      "                                  'strength–power. Cons: No significant effect '\n",
      "                                  'on body composition, aerobic capacity, or '\n",
      "                                  'muscle endurance.'}}\n",
      "{'creation_date': '2024-08-07',\n",
      " 'file_name': 'Sports Supplements.csv',\n",
      " 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/examples/metadata_extraction/data/Sports '\n",
      "              'Supplements.csv',\n",
      " 'file_size': 62403,\n",
      " 'file_type': 'text/csv',\n",
      " 'last_modified_date': '2024-08-07',\n",
      " 'marvin_metadata': {'description': 'Baking soda, also known as bicarbonate of '\n",
      "                                    'soda or sodium bicarbonate (NaHCO3), is '\n",
      "                                    'used to enhance high-intensity '\n",
      "                                    'performance in anaerobic activities such '\n",
      "                                    'as rowing, cycling, swimming, and '\n",
      "                                    'running. It works by making the blood '\n",
      "                                    'more alkaline, which can improve '\n",
      "                                    'performance in lactic-acid-fueled events '\n",
      "                                    'like the 800m sprint.',\n",
      "                     'name': 'Baking soda',\n",
      "                     'pros_cons': 'Pros: Improves performance in '\n",
      "                                  'high-intensity, anaerobic activities. Cons: '\n",
      "                                  'Can cause a badly upset stomach.'}}\n",
      "{'creation_date': '2024-08-07',\n",
      " 'file_name': 'Sports Supplements.csv',\n",
      " 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/examples/metadata_extraction/data/Sports '\n",
      "              'Supplements.csv',\n",
      " 'file_size': 62403,\n",
      " 'file_type': 'text/csv',\n",
      " 'last_modified_date': '2024-08-07',\n",
      " 'marvin_metadata': {'description': 'Branched-chain amino acids (BCAAs) are '\n",
      "                                    'essential nutrients that the body obtains '\n",
      "                                    'from proteins found in food, especially '\n",
      "                                    'meat, dairy products, and legumes. They '\n",
      "                                    'include leucine, isoleucine, and valine.',\n",
      "                     'name': 'BCAAs',\n",
      "                     'pros_cons': 'Pros: May help with fatigue resistance, '\n",
      "                                  'aerobic endurance, and performance in '\n",
      "                                  'activities like cycling and circuit '\n",
      "                                  'training. Cons: Limited evidence on '\n",
      "                                  'long-term benefits and potential side '\n",
      "                                  'effects.'}}\n",
      "{'creation_date': '2024-08-07',\n",
      " 'file_name': 'Sports Supplements.csv',\n",
      " 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/examples/metadata_extraction/data/Sports '\n",
      "              'Supplements.csv',\n",
      " 'file_size': 62403,\n",
      " 'file_type': 'text/csv',\n",
      " 'last_modified_date': '2024-08-07',\n",
      " 'marvin_metadata': {'description': 'Branched-chain amino acids (BCAAs) are '\n",
      "                                    'essential nutrients that the body obtains '\n",
      "                                    'from proteins found in food, especially '\n",
      "                                    'meat, dairy products, and legumes. They '\n",
      "                                    'include leucine, isoleucine, and valine. '\n",
      "                                    'BCAAs are commonly used to improve '\n",
      "                                    'exercise performance and reduce protein '\n",
      "                                    'and muscle breakdown during intense '\n",
      "                                    'exercise.',\n",
      "                     'name': 'BCAAs',\n",
      "                     'pros_cons': 'Pros: \\n'\n",
      "                                  '1. May improve aerobic performance, '\n",
      "                                  'endurance, power, and strength.\\n'\n",
      "                                  '2. Can enhance immune defenses in athletes '\n",
      "                                  'and general fitness.\\n'\n",
      "                                  '3. Useful for various types of exercise '\n",
      "                                  'including cycling and running.\\n'\n",
      "                                  '\\n'\n",
      "                                  'Cons: \\n'\n",
      "                                  '1. Limited evidence on long-term benefits.\\n'\n",
      "                                  '2. Potential for overconsumption leading to '\n",
      "                                  'imbalances.\\n'\n",
      "                                  '3. Some studies show no significant '\n",
      "                                  'benefit.'}}\n",
      "{'creation_date': '2024-08-07',\n",
      " 'file_name': 'Sports Supplements.csv',\n",
      " 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/examples/metadata_extraction/data/Sports '\n",
      "              'Supplements.csv',\n",
      " 'file_size': 62403,\n",
      " 'file_type': 'text/csv',\n",
      " 'last_modified_date': '2024-08-07',\n",
      " 'marvin_metadata': {'description': 'Branched-chain amino acids (BCAAs) are '\n",
      "                                    'essential nutrients that the body obtains '\n",
      "                                    'from proteins found in food, especially '\n",
      "                                    'meat, dairy products, and legumes. They '\n",
      "                                    'include leucine, isoleucine, and valine.',\n",
      "                     'name': 'BCAAs',\n",
      "                     'pros_cons': 'Pros: May support immune defenses in '\n",
      "                                  'athletes, aid in general fitness, assist in '\n",
      "                                  'running, swimming, and rowing, and help '\n",
      "                                  'with body composition, fat burning, muscle '\n",
      "                                  'building, muscle damage, soreness, '\n",
      "                                  'recovery, and injury prevention. Cons: '\n",
      "                                  'Effectiveness can vary based on individual '\n",
      "                                  'response and specific use case.'}}\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "for i in range(5):\n",
    "    pprint(nodes[i].metadata)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
